{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>msno</th>\n",
       "      <th>song_id</th>\n",
       "      <th>source_system_tab</th>\n",
       "      <th>source_screen_name</th>\n",
       "      <th>source_type</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=</td>\n",
       "      <td>explore</td>\n",
       "      <td>Explore</td>\n",
       "      <td>online-playlist</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=</td>\n",
       "      <td>explore</td>\n",
       "      <td>Explore</td>\n",
       "      <td>online-playlist</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           msno  \\\n",
       "0  FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "1  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "2  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "3  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "4  FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "\n",
       "                                        song_id source_system_tab  \\\n",
       "0  BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=           explore   \n",
       "1  bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=        my library   \n",
       "2  JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=        my library   \n",
       "3  2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=        my library   \n",
       "4  3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=           explore   \n",
       "\n",
       "    source_screen_name      source_type  target  \n",
       "0              Explore  online-playlist       1  \n",
       "1  Local playlist more   local-playlist       1  \n",
       "2  Local playlist more   local-playlist       1  \n",
       "3  Local playlist more   local-playlist       1  \n",
       "4              Explore  online-playlist       1  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train = pd.read_csv('train.csv')\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>msno</th>\n",
       "      <th>song_id</th>\n",
       "      <th>source_system_tab</th>\n",
       "      <th>source_screen_name</th>\n",
       "      <th>source_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=</td>\n",
       "      <td>WmHKgKMlp1lQMecNdNvDMkvIycZYHnFwDT72I5sIssc=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-library</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=</td>\n",
       "      <td>y/rsZ9DC7FwK5F2PK2D5mj+aOBUJAjuu3dZ14NgE0vM=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-library</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>/uQAlrAkaczV+nWCd2sPF2ekvXPRipV7q0l+gbLuxjw=</td>\n",
       "      <td>8eZLFOdGVdXBSqoAv5nsLigeH2BvKXzTQYtUM53I0k4=</td>\n",
       "      <td>discover</td>\n",
       "      <td>NaN</td>\n",
       "      <td>song-based-playlist</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=</td>\n",
       "      <td>ztCf8thYsS4YN3GcIL/bvoxLm/T5mYBVKOO4C9NiVfQ=</td>\n",
       "      <td>radio</td>\n",
       "      <td>Radio</td>\n",
       "      <td>radio</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=</td>\n",
       "      <td>MKVMpslKcQhMaFEgcEQhEfi5+RZhMYlU3eRDpySrH8Y=</td>\n",
       "      <td>radio</td>\n",
       "      <td>Radio</td>\n",
       "      <td>radio</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id                                          msno  \\\n",
       "0   0  V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=   \n",
       "1   1  V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=   \n",
       "2   2  /uQAlrAkaczV+nWCd2sPF2ekvXPRipV7q0l+gbLuxjw=   \n",
       "3   3  1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=   \n",
       "4   4  1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=   \n",
       "\n",
       "                                        song_id source_system_tab  \\\n",
       "0  WmHKgKMlp1lQMecNdNvDMkvIycZYHnFwDT72I5sIssc=        my library   \n",
       "1  y/rsZ9DC7FwK5F2PK2D5mj+aOBUJAjuu3dZ14NgE0vM=        my library   \n",
       "2  8eZLFOdGVdXBSqoAv5nsLigeH2BvKXzTQYtUM53I0k4=          discover   \n",
       "3  ztCf8thYsS4YN3GcIL/bvoxLm/T5mYBVKOO4C9NiVfQ=             radio   \n",
       "4  MKVMpslKcQhMaFEgcEQhEfi5+RZhMYlU3eRDpySrH8Y=             radio   \n",
       "\n",
       "    source_screen_name          source_type  \n",
       "0  Local playlist more        local-library  \n",
       "1  Local playlist more        local-library  \n",
       "2                  NaN  song-based-playlist  \n",
       "3                Radio                radio  \n",
       "4                Radio                radio  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test = pd.read_csv('test.csv')\n",
    "test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>msno</th>\n",
       "      <th>city</th>\n",
       "      <th>bd</th>\n",
       "      <th>gender</th>\n",
       "      <th>registered_via</th>\n",
       "      <th>registration_init_time</th>\n",
       "      <th>expiration_date</th>\n",
       "      <th>expiration_date_year</th>\n",
       "      <th>expiration_date_month</th>\n",
       "      <th>expiration_date_Ym</th>\n",
       "      <th>use_days</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>unknown</td>\n",
       "      <td>7</td>\n",
       "      <td>20110820</td>\n",
       "      <td>20170920</td>\n",
       "      <td>2017</td>\n",
       "      <td>9</td>\n",
       "      <td>201709</td>\n",
       "      <td>2223</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>unknown</td>\n",
       "      <td>7</td>\n",
       "      <td>20150628</td>\n",
       "      <td>20170622</td>\n",
       "      <td>2017</td>\n",
       "      <td>6</td>\n",
       "      <td>201706</td>\n",
       "      <td>725</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>unknown</td>\n",
       "      <td>4</td>\n",
       "      <td>20160411</td>\n",
       "      <td>20170712</td>\n",
       "      <td>2017</td>\n",
       "      <td>7</td>\n",
       "      <td>201707</td>\n",
       "      <td>457</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>unknown</td>\n",
       "      <td>9</td>\n",
       "      <td>20150906</td>\n",
       "      <td>20150907</td>\n",
       "      <td>2015</td>\n",
       "      <td>9</td>\n",
       "      <td>201509</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>unknown</td>\n",
       "      <td>4</td>\n",
       "      <td>20170126</td>\n",
       "      <td>20170613</td>\n",
       "      <td>2017</td>\n",
       "      <td>6</td>\n",
       "      <td>201706</td>\n",
       "      <td>138</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           msno  city  bd   gender  \\\n",
       "0  XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=     1 NaN  unknown   \n",
       "1  UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=     1 NaN  unknown   \n",
       "2  D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=     1 NaN  unknown   \n",
       "3  mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=     1 NaN  unknown   \n",
       "4  q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=     1 NaN  unknown   \n",
       "\n",
       "   registered_via  registration_init_time  expiration_date  \\\n",
       "0               7                20110820         20170920   \n",
       "1               7                20150628         20170622   \n",
       "2               4                20160411         20170712   \n",
       "3               9                20150906         20150907   \n",
       "4               4                20170126         20170613   \n",
       "\n",
       "   expiration_date_year  expiration_date_month  expiration_date_Ym  use_days  \n",
       "0                  2017                      9              201709      2223  \n",
       "1                  2017                      6              201706       725  \n",
       "2                  2017                      7              201707       457  \n",
       "3                  2015                      9              201509         1  \n",
       "4                  2017                      6              201706       138  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "members_data = pd.read_csv('FE_members.csv')\n",
    "members_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>msno</th>\n",
       "      <th>city</th>\n",
       "      <th>bd</th>\n",
       "      <th>gender</th>\n",
       "      <th>registered_via</th>\n",
       "      <th>expiration_date_year</th>\n",
       "      <th>expiration_date_month</th>\n",
       "      <th>expiration_date_Ym</th>\n",
       "      <th>use_days</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>unknown</td>\n",
       "      <td>7</td>\n",
       "      <td>2017</td>\n",
       "      <td>9</td>\n",
       "      <td>201709</td>\n",
       "      <td>2223</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>unknown</td>\n",
       "      <td>7</td>\n",
       "      <td>2017</td>\n",
       "      <td>6</td>\n",
       "      <td>201706</td>\n",
       "      <td>725</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>unknown</td>\n",
       "      <td>4</td>\n",
       "      <td>2017</td>\n",
       "      <td>7</td>\n",
       "      <td>201707</td>\n",
       "      <td>457</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>unknown</td>\n",
       "      <td>9</td>\n",
       "      <td>2015</td>\n",
       "      <td>9</td>\n",
       "      <td>201509</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>unknown</td>\n",
       "      <td>4</td>\n",
       "      <td>2017</td>\n",
       "      <td>6</td>\n",
       "      <td>201706</td>\n",
       "      <td>138</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           msno  city  bd   gender  \\\n",
       "0  XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=     1 NaN  unknown   \n",
       "1  UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=     1 NaN  unknown   \n",
       "2  D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=     1 NaN  unknown   \n",
       "3  mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=     1 NaN  unknown   \n",
       "4  q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=     1 NaN  unknown   \n",
       "\n",
       "   registered_via  expiration_date_year  expiration_date_month  \\\n",
       "0               7                  2017                      9   \n",
       "1               7                  2017                      6   \n",
       "2               4                  2017                      7   \n",
       "3               9                  2015                      9   \n",
       "4               4                  2017                      6   \n",
       "\n",
       "   expiration_date_Ym  use_days  \n",
       "0              201709      2223  \n",
       "1              201706       725  \n",
       "2              201707       457  \n",
       "3              201509         1  \n",
       "4              201706       138  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "members = members_data.drop(['registration_init_time','expiration_date'],axis =1)\n",
    "members.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>song_id</th>\n",
       "      <th>song_length</th>\n",
       "      <th>genre_ids</th>\n",
       "      <th>artist_name</th>\n",
       "      <th>composer</th>\n",
       "      <th>lyricist</th>\n",
       "      <th>language</th>\n",
       "      <th>song_length_s</th>\n",
       "      <th>song_length_s_log</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=</td>\n",
       "      <td>247640</td>\n",
       "      <td>465</td>\n",
       "      <td>張信哲 (Jeff Chang)</td>\n",
       "      <td>董貞</td>\n",
       "      <td>何啟弘</td>\n",
       "      <td>3.0</td>\n",
       "      <td>247</td>\n",
       "      <td>5.513429</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=</td>\n",
       "      <td>197328</td>\n",
       "      <td>444</td>\n",
       "      <td>BLACKPINK</td>\n",
       "      <td>TEDDY|  FUTURE BOUNCE|  Bekuh BOOM</td>\n",
       "      <td>TEDDY</td>\n",
       "      <td>31.0</td>\n",
       "      <td>197</td>\n",
       "      <td>5.288267</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=</td>\n",
       "      <td>231781</td>\n",
       "      <td>465</td>\n",
       "      <td>SUPER JUNIOR</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>31.0</td>\n",
       "      <td>231</td>\n",
       "      <td>5.446737</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=</td>\n",
       "      <td>273554</td>\n",
       "      <td>465</td>\n",
       "      <td>S.H.E</td>\n",
       "      <td>湯小康</td>\n",
       "      <td>徐世珍</td>\n",
       "      <td>3.0</td>\n",
       "      <td>273</td>\n",
       "      <td>5.613128</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=</td>\n",
       "      <td>140329</td>\n",
       "      <td>726</td>\n",
       "      <td>貴族精選</td>\n",
       "      <td>Traditional</td>\n",
       "      <td>Traditional</td>\n",
       "      <td>52.0</td>\n",
       "      <td>140</td>\n",
       "      <td>4.948760</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        song_id  song_length genre_ids  \\\n",
       "0  CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=       247640       465   \n",
       "1  o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=       197328       444   \n",
       "2  DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=       231781       465   \n",
       "3  dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=       273554       465   \n",
       "4  W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=       140329       726   \n",
       "\n",
       "        artist_name                            composer     lyricist  \\\n",
       "0  張信哲 (Jeff Chang)                                  董貞          何啟弘   \n",
       "1         BLACKPINK  TEDDY|  FUTURE BOUNCE|  Bekuh BOOM        TEDDY   \n",
       "2      SUPER JUNIOR                                 NaN          NaN   \n",
       "3             S.H.E                                 湯小康          徐世珍   \n",
       "4              貴族精選                         Traditional  Traditional   \n",
       "\n",
       "   language  song_length_s  song_length_s_log  \n",
       "0       3.0            247           5.513429  \n",
       "1      31.0            197           5.288267  \n",
       "2      31.0            231           5.446737  \n",
       "3       3.0            273           5.613128  \n",
       "4      52.0            140           4.948760  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "songs_data = pd.read_csv('FE_songs.csv')\n",
    "songs_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>song_id</th>\n",
       "      <th>language</th>\n",
       "      <th>song_length_s_log</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=</td>\n",
       "      <td>3.0</td>\n",
       "      <td>5.513429</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=</td>\n",
       "      <td>31.0</td>\n",
       "      <td>5.288267</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=</td>\n",
       "      <td>31.0</td>\n",
       "      <td>5.446737</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=</td>\n",
       "      <td>3.0</td>\n",
       "      <td>5.613128</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=</td>\n",
       "      <td>52.0</td>\n",
       "      <td>4.948760</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        song_id  language  song_length_s_log\n",
       "0  CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=       3.0           5.513429\n",
       "1  o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=      31.0           5.288267\n",
       "2  DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=      31.0           5.446737\n",
       "3  dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=       3.0           5.613128\n",
       "4  W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=      52.0           4.948760"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "songs = songs_data.drop(['song_length','artist_name','composer','lyricist','song_length_s','genre_ids'],axis =1)\n",
    "songs.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 表合并"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "d:\\python37\\lib\\site-packages\\ipykernel_launcher.py:3: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
      "of pandas will change to not sort by default.\n",
      "\n",
      "To accept the future behavior, pass 'sort=False'.\n",
      "\n",
      "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
      "\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>msno</th>\n",
       "      <th>song_id</th>\n",
       "      <th>source</th>\n",
       "      <th>source_screen_name</th>\n",
       "      <th>source_system_tab</th>\n",
       "      <th>source_type</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>NaN</td>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=</td>\n",
       "      <td>train</td>\n",
       "      <td>Explore</td>\n",
       "      <td>explore</td>\n",
       "      <td>online-playlist</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=</td>\n",
       "      <td>train</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>my library</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaN</td>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=</td>\n",
       "      <td>train</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>my library</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaN</td>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=</td>\n",
       "      <td>train</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>my library</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=</td>\n",
       "      <td>train</td>\n",
       "      <td>Explore</td>\n",
       "      <td>explore</td>\n",
       "      <td>online-playlist</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id                                          msno  \\\n",
       "0 NaN  FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "1 NaN  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "2 NaN  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "3 NaN  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "4 NaN  FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "\n",
       "                                        song_id source   source_screen_name  \\\n",
       "0  BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=  train              Explore   \n",
       "1  bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=  train  Local playlist more   \n",
       "2  JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=  train  Local playlist more   \n",
       "3  2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=  train  Local playlist more   \n",
       "4  3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=  train              Explore   \n",
       "\n",
       "  source_system_tab      source_type  target  \n",
       "0           explore  online-playlist     1.0  \n",
       "1        my library   local-playlist     1.0  \n",
       "2        my library   local-playlist     1.0  \n",
       "3        my library   local-playlist     1.0  \n",
       "4           explore  online-playlist     1.0  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train['source'] = 'train'\n",
    "test['source'] = 'test'\n",
    "data = pd.concat([train,test],ignore_index = False)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(9934208, 8)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>msno</th>\n",
       "      <th>song_id</th>\n",
       "      <th>source</th>\n",
       "      <th>source_screen_name</th>\n",
       "      <th>source_system_tab</th>\n",
       "      <th>source_type</th>\n",
       "      <th>target</th>\n",
       "      <th>city</th>\n",
       "      <th>bd</th>\n",
       "      <th>gender</th>\n",
       "      <th>registered_via</th>\n",
       "      <th>expiration_date_year</th>\n",
       "      <th>expiration_date_month</th>\n",
       "      <th>expiration_date_Ym</th>\n",
       "      <th>use_days</th>\n",
       "      <th>language</th>\n",
       "      <th>song_length_s_log</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>NaN</td>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=</td>\n",
       "      <td>train</td>\n",
       "      <td>Explore</td>\n",
       "      <td>explore</td>\n",
       "      <td>online-playlist</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>unknown</td>\n",
       "      <td>7</td>\n",
       "      <td>2017</td>\n",
       "      <td>10</td>\n",
       "      <td>201710</td>\n",
       "      <td>2103</td>\n",
       "      <td>52.0</td>\n",
       "      <td>5.332719</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=</td>\n",
       "      <td>train</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>my library</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1.0</td>\n",
       "      <td>13</td>\n",
       "      <td>24.0</td>\n",
       "      <td>female</td>\n",
       "      <td>9</td>\n",
       "      <td>2017</td>\n",
       "      <td>9</td>\n",
       "      <td>201709</td>\n",
       "      <td>2301</td>\n",
       "      <td>52.0</td>\n",
       "      <td>5.652489</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaN</td>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=</td>\n",
       "      <td>train</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>my library</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1.0</td>\n",
       "      <td>13</td>\n",
       "      <td>24.0</td>\n",
       "      <td>female</td>\n",
       "      <td>9</td>\n",
       "      <td>2017</td>\n",
       "      <td>9</td>\n",
       "      <td>201709</td>\n",
       "      <td>2301</td>\n",
       "      <td>52.0</td>\n",
       "      <td>5.420535</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaN</td>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=</td>\n",
       "      <td>train</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>my library</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1.0</td>\n",
       "      <td>13</td>\n",
       "      <td>24.0</td>\n",
       "      <td>female</td>\n",
       "      <td>9</td>\n",
       "      <td>2017</td>\n",
       "      <td>9</td>\n",
       "      <td>201709</td>\n",
       "      <td>2301</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>5.545177</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=</td>\n",
       "      <td>train</td>\n",
       "      <td>Explore</td>\n",
       "      <td>explore</td>\n",
       "      <td>online-playlist</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>unknown</td>\n",
       "      <td>7</td>\n",
       "      <td>2017</td>\n",
       "      <td>10</td>\n",
       "      <td>201710</td>\n",
       "      <td>2103</td>\n",
       "      <td>52.0</td>\n",
       "      <td>5.236442</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id                                          msno  \\\n",
       "0 NaN  FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "1 NaN  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "2 NaN  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "3 NaN  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "4 NaN  FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "\n",
       "                                        song_id source   source_screen_name  \\\n",
       "0  BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=  train              Explore   \n",
       "1  bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=  train  Local playlist more   \n",
       "2  JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=  train  Local playlist more   \n",
       "3  2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=  train  Local playlist more   \n",
       "4  3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=  train              Explore   \n",
       "\n",
       "  source_system_tab      source_type  target  city    bd   gender  \\\n",
       "0           explore  online-playlist     1.0     1   NaN  unknown   \n",
       "1        my library   local-playlist     1.0    13  24.0   female   \n",
       "2        my library   local-playlist     1.0    13  24.0   female   \n",
       "3        my library   local-playlist     1.0    13  24.0   female   \n",
       "4           explore  online-playlist     1.0     1   NaN  unknown   \n",
       "\n",
       "   registered_via  expiration_date_year  expiration_date_month  \\\n",
       "0               7                  2017                     10   \n",
       "1               9                  2017                      9   \n",
       "2               9                  2017                      9   \n",
       "3               9                  2017                      9   \n",
       "4               7                  2017                     10   \n",
       "\n",
       "   expiration_date_Ym  use_days  language  song_length_s_log  \n",
       "0              201710      2103      52.0           5.332719  \n",
       "1              201709      2301      52.0           5.652489  \n",
       "2              201709      2301      52.0           5.420535  \n",
       "3              201709      2301      -1.0           5.545177  \n",
       "4              201710      2103      52.0           5.236442  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.merge(data,members,how='left',on='msno')\n",
    "data = pd.merge(data,songs,how='left',on='song_id')\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "id                       7377418\n",
       "msno                           0\n",
       "song_id                        0\n",
       "source                         0\n",
       "source_screen_name        577687\n",
       "source_system_tab          33291\n",
       "source_type                28836\n",
       "target                   2556790\n",
       "city                           0\n",
       "bd                       3994195\n",
       "gender                         0\n",
       "registered_via                 0\n",
       "expiration_date_year           0\n",
       "expiration_date_month          0\n",
       "expiration_date_Ym             0\n",
       "use_days                       0\n",
       "language                     139\n",
       "song_length_s_log            139\n",
       "dtype: int64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "id                       7377418\n",
       "msno                           0\n",
       "song_id                        0\n",
       "source                         0\n",
       "source_screen_name        577687\n",
       "source_system_tab          33291\n",
       "source_type                28836\n",
       "target                   2556790\n",
       "city                           0\n",
       "bd                             0\n",
       "gender                         0\n",
       "registered_via                 0\n",
       "expiration_date_year           0\n",
       "expiration_date_month          0\n",
       "expiration_date_Ym             0\n",
       "use_days                       0\n",
       "language                       0\n",
       "song_length_s_log              0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data['language'].fillna(0,inplace =True)\n",
    "data['song_length_s_log'].fillna(0,inplace = True)\n",
    "data['bd'].fillna(data.bd.median(),inplace = True)\n",
    "data.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "id                       7377418\n",
       "msno                           0\n",
       "song_id                        0\n",
       "source                         0\n",
       "source_screen_name             0\n",
       "source_system_tab              0\n",
       "source_type                    0\n",
       "target                   2556790\n",
       "city                           0\n",
       "bd                             0\n",
       "gender                         0\n",
       "registered_via                 0\n",
       "expiration_date_year           0\n",
       "expiration_date_month          0\n",
       "expiration_date_Ym             0\n",
       "use_days                       0\n",
       "language                       0\n",
       "song_length_s_log              0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n = ['source_screen_name','source_system_tab','source_type']\n",
    "for col in n:\n",
    "    data[col].fillna('unknown',inplace = True)\n",
    "data.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data = data.drop('expiration_date_Ym',axis =1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "17 Index(['id', 'msno', 'song_id', 'source', 'source_screen_name',\n",
      "       'source_system_tab', 'source_type', 'target', 'city', 'bd', 'gender',\n",
      "       'registered_via', 'expiration_date_year', 'expiration_date_month',\n",
      "       'use_days', 'language', 'song_length_s_log'],\n",
      "      dtype='object')\n"
     ]
    }
   ],
   "source": [
    "print(len(data.columns),data.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 9个类别特征\n",
    "# 3个数值型特征 bd,use_days,song_length_s_log\n",
    "# 4 msno,song_id,target,source,id\n",
    "cat_features = ['source_screen_name','source_system_tab','source_type','city','gender','registered_via','expiration_date_year',\n",
    "               'expiration_date_month','language']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "source_screen_name 23\n",
      "source_system_tab 9\n",
      "source_type 13\n",
      "city 21\n",
      "gender 3\n",
      "registered_via 6\n",
      "expiration_date_year 17\n",
      "expiration_date_month 12\n",
      "language 11\n"
     ]
    }
   ],
   "source": [
    "for col in cat_features:\n",
    "    print(col,len(data[col].unique()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "for col in cat_features:\n",
    "    data[col] = data[col].astype(object)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_screen_name_Album more</th>\n",
       "      <th>source_screen_name_Artist more</th>\n",
       "      <th>source_screen_name_Concert</th>\n",
       "      <th>source_screen_name_Discover Chart</th>\n",
       "      <th>source_screen_name_Discover Feature</th>\n",
       "      <th>source_screen_name_Discover Genre</th>\n",
       "      <th>source_screen_name_Discover New</th>\n",
       "      <th>source_screen_name_Explore</th>\n",
       "      <th>source_screen_name_Local playlist more</th>\n",
       "      <th>source_screen_name_My library</th>\n",
       "      <th>...</th>\n",
       "      <th>language_0.0</th>\n",
       "      <th>language_3.0</th>\n",
       "      <th>language_10.0</th>\n",
       "      <th>language_17.0</th>\n",
       "      <th>language_24.0</th>\n",
       "      <th>language_31.0</th>\n",
       "      <th>language_38.0</th>\n",
       "      <th>language_45.0</th>\n",
       "      <th>language_52.0</th>\n",
       "      <th>language_59.0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 115 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   source_screen_name_Album more  source_screen_name_Artist more  \\\n",
       "0                              0                               0   \n",
       "1                              0                               0   \n",
       "2                              0                               0   \n",
       "3                              0                               0   \n",
       "4                              0                               0   \n",
       "\n",
       "   source_screen_name_Concert  source_screen_name_Discover Chart  \\\n",
       "0                           0                                  0   \n",
       "1                           0                                  0   \n",
       "2                           0                                  0   \n",
       "3                           0                                  0   \n",
       "4                           0                                  0   \n",
       "\n",
       "   source_screen_name_Discover Feature  source_screen_name_Discover Genre  \\\n",
       "0                                    0                                  0   \n",
       "1                                    0                                  0   \n",
       "2                                    0                                  0   \n",
       "3                                    0                                  0   \n",
       "4                                    0                                  0   \n",
       "\n",
       "   source_screen_name_Discover New  source_screen_name_Explore  \\\n",
       "0                                0                           1   \n",
       "1                                0                           0   \n",
       "2                                0                           0   \n",
       "3                                0                           0   \n",
       "4                                0                           1   \n",
       "\n",
       "   source_screen_name_Local playlist more  source_screen_name_My library  ...  \\\n",
       "0                                       0                              0  ...   \n",
       "1                                       1                              0  ...   \n",
       "2                                       1                              0  ...   \n",
       "3                                       1                              0  ...   \n",
       "4                                       0                              0  ...   \n",
       "\n",
       "   language_0.0  language_3.0  language_10.0  language_17.0  language_24.0  \\\n",
       "0             0             0              0              0              0   \n",
       "1             0             0              0              0              0   \n",
       "2             0             0              0              0              0   \n",
       "3             0             0              0              0              0   \n",
       "4             0             0              0              0              0   \n",
       "\n",
       "   language_31.0  language_38.0  language_45.0  language_52.0  language_59.0  \n",
       "0              0              0              0              1              0  \n",
       "1              0              0              0              1              0  \n",
       "2              0              0              0              1              0  \n",
       "3              0              0              0              0              0  \n",
       "4              0              0              0              1              0  \n",
       "\n",
       "[5 rows x 115 columns]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_cat = pd.get_dummies(data[cat_features])\n",
    "data_cat.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_num =data[['bd','use_days','song_length_s_log']]\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "mms = MinMaxScaler()\n",
    "data_num = mms.fit_transform(data_num)\n",
    "data_mms = pd.DataFrame(data = data_num,columns = ['bd','use_days','song_length_s_log'],index = data.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>msno</th>\n",
       "      <th>song_id</th>\n",
       "      <th>target</th>\n",
       "      <th>source</th>\n",
       "      <th>bd</th>\n",
       "      <th>use_days</th>\n",
       "      <th>song_length_s_log</th>\n",
       "      <th>source_screen_name_Album more</th>\n",
       "      <th>source_screen_name_Artist more</th>\n",
       "      <th>...</th>\n",
       "      <th>language_0.0</th>\n",
       "      <th>language_3.0</th>\n",
       "      <th>language_10.0</th>\n",
       "      <th>language_17.0</th>\n",
       "      <th>language_24.0</th>\n",
       "      <th>language_31.0</th>\n",
       "      <th>language_38.0</th>\n",
       "      <th>language_45.0</th>\n",
       "      <th>language_52.0</th>\n",
       "      <th>language_59.0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>NaN</td>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=</td>\n",
       "      <td>1.0</td>\n",
       "      <td>train</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.408429</td>\n",
       "      <td>0.573898</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=</td>\n",
       "      <td>1.0</td>\n",
       "      <td>train</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.446883</td>\n",
       "      <td>0.608311</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaN</td>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=</td>\n",
       "      <td>1.0</td>\n",
       "      <td>train</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.446883</td>\n",
       "      <td>0.583348</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaN</td>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=</td>\n",
       "      <td>1.0</td>\n",
       "      <td>train</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.446883</td>\n",
       "      <td>0.596762</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=</td>\n",
       "      <td>1.0</td>\n",
       "      <td>train</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.408429</td>\n",
       "      <td>0.563537</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 123 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   id                                          msno  \\\n",
       "0 NaN  FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "1 NaN  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "2 NaN  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "3 NaN  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "4 NaN  FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "\n",
       "                                        song_id  target source        bd  \\\n",
       "0  BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=     1.0  train  0.285714   \n",
       "1  bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=     1.0  train  0.250000   \n",
       "2  JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=     1.0  train  0.250000   \n",
       "3  2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=     1.0  train  0.250000   \n",
       "4  3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=     1.0  train  0.285714   \n",
       "\n",
       "   use_days  song_length_s_log  source_screen_name_Album more  \\\n",
       "0  0.408429           0.573898                              0   \n",
       "1  0.446883           0.608311                              0   \n",
       "2  0.446883           0.583348                              0   \n",
       "3  0.446883           0.596762                              0   \n",
       "4  0.408429           0.563537                              0   \n",
       "\n",
       "   source_screen_name_Artist more  ...  language_0.0  language_3.0  \\\n",
       "0                               0  ...             0             0   \n",
       "1                               0  ...             0             0   \n",
       "2                               0  ...             0             0   \n",
       "3                               0  ...             0             0   \n",
       "4                               0  ...             0             0   \n",
       "\n",
       "   language_10.0  language_17.0  language_24.0  language_31.0  language_38.0  \\\n",
       "0              0              0              0              0              0   \n",
       "1              0              0              0              0              0   \n",
       "2              0              0              0              0              0   \n",
       "3              0              0              0              0              0   \n",
       "4              0              0              0              0              0   \n",
       "\n",
       "   language_45.0  language_52.0  language_59.0  \n",
       "0              0              1              0  \n",
       "1              0              1              0  \n",
       "2              0              1              0  \n",
       "3              0              0              0  \n",
       "4              0              1              0  \n",
       "\n",
       "[5 rows x 123 columns]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.concat([data[['id','msno','song_id','target','source']],data_mms,data_cat],axis=1)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train = data.loc[data['source'] == 'train']\n",
    "test = data.loc[data['source'] == 'test']\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "d:\\python37\\lib\\site-packages\\pandas\\core\\frame.py:3940: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  errors=errors)\n"
     ]
    }
   ],
   "source": [
    "train.drop(['source','id'],axis =1,inplace = True)\n",
    "test.drop(['source','target'],axis = 1,inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>msno</th>\n",
       "      <th>song_id</th>\n",
       "      <th>target</th>\n",
       "      <th>bd</th>\n",
       "      <th>use_days</th>\n",
       "      <th>song_length_s_log</th>\n",
       "      <th>source_screen_name_Album more</th>\n",
       "      <th>source_screen_name_Artist more</th>\n",
       "      <th>source_screen_name_Concert</th>\n",
       "      <th>source_screen_name_Discover Chart</th>\n",
       "      <th>...</th>\n",
       "      <th>language_0.0</th>\n",
       "      <th>language_3.0</th>\n",
       "      <th>language_10.0</th>\n",
       "      <th>language_17.0</th>\n",
       "      <th>language_24.0</th>\n",
       "      <th>language_31.0</th>\n",
       "      <th>language_38.0</th>\n",
       "      <th>language_45.0</th>\n",
       "      <th>language_52.0</th>\n",
       "      <th>language_59.0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.408429</td>\n",
       "      <td>0.573898</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.446883</td>\n",
       "      <td>0.608311</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.446883</td>\n",
       "      <td>0.583348</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.446883</td>\n",
       "      <td>0.596762</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.408429</td>\n",
       "      <td>0.563537</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 121 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           msno  \\\n",
       "0  FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "1  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "2  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "3  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "4  FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "\n",
       "                                        song_id  target        bd  use_days  \\\n",
       "0  BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=     1.0  0.285714  0.408429   \n",
       "1  bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=     1.0  0.250000  0.446883   \n",
       "2  JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=     1.0  0.250000  0.446883   \n",
       "3  2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=     1.0  0.250000  0.446883   \n",
       "4  3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=     1.0  0.285714  0.408429   \n",
       "\n",
       "   song_length_s_log  source_screen_name_Album more  \\\n",
       "0           0.573898                              0   \n",
       "1           0.608311                              0   \n",
       "2           0.583348                              0   \n",
       "3           0.596762                              0   \n",
       "4           0.563537                              0   \n",
       "\n",
       "   source_screen_name_Artist more  source_screen_name_Concert  \\\n",
       "0                               0                           0   \n",
       "1                               0                           0   \n",
       "2                               0                           0   \n",
       "3                               0                           0   \n",
       "4                               0                           0   \n",
       "\n",
       "   source_screen_name_Discover Chart  ...  language_0.0  language_3.0  \\\n",
       "0                                  0  ...             0             0   \n",
       "1                                  0  ...             0             0   \n",
       "2                                  0  ...             0             0   \n",
       "3                                  0  ...             0             0   \n",
       "4                                  0  ...             0             0   \n",
       "\n",
       "   language_10.0  language_17.0  language_24.0  language_31.0  language_38.0  \\\n",
       "0              0              0              0              0              0   \n",
       "1              0              0              0              0              0   \n",
       "2              0              0              0              0              0   \n",
       "3              0              0              0              0              0   \n",
       "4              0              0              0              0              0   \n",
       "\n",
       "   language_45.0  language_52.0  language_59.0  \n",
       "0              0              1              0  \n",
       "1              0              1              0  \n",
       "2              0              1              0  \n",
       "3              0              0              0  \n",
       "4              0              1              0  \n",
       "\n",
       "[5 rows x 121 columns]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "d:\\python37\\lib\\site-packages\\pandas\\core\\generic.py:5096: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  self[name] = value\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>msno</th>\n",
       "      <th>song_id</th>\n",
       "      <th>bd</th>\n",
       "      <th>use_days</th>\n",
       "      <th>song_length_s_log</th>\n",
       "      <th>source_screen_name_Album more</th>\n",
       "      <th>source_screen_name_Artist more</th>\n",
       "      <th>source_screen_name_Concert</th>\n",
       "      <th>source_screen_name_Discover Chart</th>\n",
       "      <th>...</th>\n",
       "      <th>language_0.0</th>\n",
       "      <th>language_3.0</th>\n",
       "      <th>language_10.0</th>\n",
       "      <th>language_17.0</th>\n",
       "      <th>language_24.0</th>\n",
       "      <th>language_31.0</th>\n",
       "      <th>language_38.0</th>\n",
       "      <th>language_45.0</th>\n",
       "      <th>language_52.0</th>\n",
       "      <th>language_59.0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=</td>\n",
       "      <td>WmHKgKMlp1lQMecNdNvDMkvIycZYHnFwDT72I5sIssc=</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.112061</td>\n",
       "      <td>0.582871</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=</td>\n",
       "      <td>y/rsZ9DC7FwK5F2PK2D5mj+aOBUJAjuu3dZ14NgE0vM=</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.112061</td>\n",
       "      <td>0.621112</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>/uQAlrAkaczV+nWCd2sPF2ekvXPRipV7q0l+gbLuxjw=</td>\n",
       "      <td>8eZLFOdGVdXBSqoAv5nsLigeH2BvKXzTQYtUM53I0k4=</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.001359</td>\n",
       "      <td>0.619423</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=</td>\n",
       "      <td>ztCf8thYsS4YN3GcIL/bvoxLm/T5mYBVKOO4C9NiVfQ=</td>\n",
       "      <td>0.321429</td>\n",
       "      <td>0.692756</td>\n",
       "      <td>0.608688</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=</td>\n",
       "      <td>MKVMpslKcQhMaFEgcEQhEfi5+RZhMYlU3eRDpySrH8Y=</td>\n",
       "      <td>0.321429</td>\n",
       "      <td>0.692756</td>\n",
       "      <td>0.569114</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 121 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   id                                          msno  \\\n",
       "0   0  V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=   \n",
       "1   1  V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=   \n",
       "2   2  /uQAlrAkaczV+nWCd2sPF2ekvXPRipV7q0l+gbLuxjw=   \n",
       "3   3  1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=   \n",
       "4   4  1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=   \n",
       "\n",
       "                                        song_id        bd  use_days  \\\n",
       "0  WmHKgKMlp1lQMecNdNvDMkvIycZYHnFwDT72I5sIssc=  0.285714  0.112061   \n",
       "1  y/rsZ9DC7FwK5F2PK2D5mj+aOBUJAjuu3dZ14NgE0vM=  0.285714  0.112061   \n",
       "2  8eZLFOdGVdXBSqoAv5nsLigeH2BvKXzTQYtUM53I0k4=  0.285714  0.001359   \n",
       "3  ztCf8thYsS4YN3GcIL/bvoxLm/T5mYBVKOO4C9NiVfQ=  0.321429  0.692756   \n",
       "4  MKVMpslKcQhMaFEgcEQhEfi5+RZhMYlU3eRDpySrH8Y=  0.321429  0.692756   \n",
       "\n",
       "   song_length_s_log  source_screen_name_Album more  \\\n",
       "0           0.582871                              0   \n",
       "1           0.621112                              0   \n",
       "2           0.619423                              0   \n",
       "3           0.608688                              0   \n",
       "4           0.569114                              0   \n",
       "\n",
       "   source_screen_name_Artist more  source_screen_name_Concert  \\\n",
       "0                               0                           0   \n",
       "1                               0                           0   \n",
       "2                               0                           0   \n",
       "3                               0                           0   \n",
       "4                               0                           0   \n",
       "\n",
       "   source_screen_name_Discover Chart  ...  language_0.0  language_3.0  \\\n",
       "0                                  0  ...             0             1   \n",
       "1                                  0  ...             0             1   \n",
       "2                                  0  ...             0             0   \n",
       "3                                  0  ...             0             0   \n",
       "4                                  0  ...             0             0   \n",
       "\n",
       "   language_10.0  language_17.0  language_24.0  language_31.0  language_38.0  \\\n",
       "0              0              0              0              0              0   \n",
       "1              0              0              0              0              0   \n",
       "2              0              1              0              0              0   \n",
       "3              0              0              0              0              0   \n",
       "4              0              0              0              0              0   \n",
       "\n",
       "   language_45.0  language_52.0  language_59.0  \n",
       "0              0              0              0  \n",
       "1              0              0              0  \n",
       "2              0              0              0  \n",
       "3              0              1              0  \n",
       "4              0              0              0  \n",
       "\n",
       "[5 rows x 121 columns]"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.reset_index(drop = True,inplace =True)\n",
    "test.id = test.id.astype('int64')\n",
    "test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "FE_train = pd.read_csv('FE_train_LGBM.csv')\n",
    "FE_test = pd.read_csv('FE_test_LGBM.csv')\n",
    "names = ['genre_ids_counts', 'artist_name_counts',\n",
    "       'song_id_counts', 'msno_counts']\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train_num = FE_train[['genre_ids_counts','artist_name_counts','song_id_counts','msno_counts']]\n",
    "mms = MinMaxScaler()\n",
    "train_num = mms.fit_transform(train_num)\n",
    "train_mms = pd.DataFrame(data = train_num,columns = names)\n",
    "train = pd.concat([train,train_mms],axis =1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "test_num = FE_test[names]\n",
    "mms = MinMaxScaler()\n",
    "test_num = mms.fit_transform(test_num)\n",
    "test_mms = pd.DataFrame(data = test_num,columns = names)\n",
    "test = pd.concat([test,test_mms],axis =1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "msno                                            0\n",
       "song_id                                         0\n",
       "target                                          0\n",
       "bd                                              0\n",
       "use_days                                        0\n",
       "song_length_s_log                               0\n",
       "source_screen_name_Album more                   0\n",
       "source_screen_name_Artist more                  0\n",
       "source_screen_name_Concert                      0\n",
       "source_screen_name_Discover Chart               0\n",
       "source_screen_name_Discover Feature             0\n",
       "source_screen_name_Discover Genre               0\n",
       "source_screen_name_Discover New                 0\n",
       "source_screen_name_Explore                      0\n",
       "source_screen_name_Local playlist more          0\n",
       "source_screen_name_My library                   0\n",
       "source_screen_name_My library_Search            0\n",
       "source_screen_name_Online playlist more         0\n",
       "source_screen_name_Others profile more          0\n",
       "source_screen_name_Payment                      0\n",
       "source_screen_name_People global                0\n",
       "source_screen_name_People local                 0\n",
       "source_screen_name_Radio                        0\n",
       "source_screen_name_Search                       0\n",
       "source_screen_name_Search Home                  0\n",
       "source_screen_name_Search Trends                0\n",
       "source_screen_name_Self profile more            0\n",
       "source_screen_name_Unknown                      0\n",
       "source_screen_name_unknown                      0\n",
       "source_system_tab_discover                      0\n",
       "                                            ...  \n",
       "expiration_date_year_2018                       0\n",
       "expiration_date_year_2019                       0\n",
       "expiration_date_year_2020                       0\n",
       "expiration_date_month_1                         0\n",
       "expiration_date_month_2                         0\n",
       "expiration_date_month_3                         0\n",
       "expiration_date_month_4                         0\n",
       "expiration_date_month_5                         0\n",
       "expiration_date_month_6                         0\n",
       "expiration_date_month_7                         0\n",
       "expiration_date_month_8                         0\n",
       "expiration_date_month_9                         0\n",
       "expiration_date_month_10                        0\n",
       "expiration_date_month_11                        0\n",
       "expiration_date_month_12                        0\n",
       "language_-1.0                                   0\n",
       "language_0.0                                    0\n",
       "language_3.0                                    0\n",
       "language_10.0                                   0\n",
       "language_17.0                                   0\n",
       "language_24.0                                   0\n",
       "language_31.0                                   0\n",
       "language_38.0                                   0\n",
       "language_45.0                                   0\n",
       "language_52.0                                   0\n",
       "language_59.0                                   0\n",
       "genre_ids_counts                           118455\n",
       "artist_name_counts                            114\n",
       "song_id_counts                                  0\n",
       "msno_counts                                     0\n",
       "Length: 125, dtype: int64"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "id                                             0\n",
       "msno                                           0\n",
       "song_id                                        0\n",
       "bd                                             0\n",
       "use_days                                       0\n",
       "song_length_s_log                              0\n",
       "source_screen_name_Album more                  0\n",
       "source_screen_name_Artist more                 0\n",
       "source_screen_name_Concert                     0\n",
       "source_screen_name_Discover Chart              0\n",
       "source_screen_name_Discover Feature            0\n",
       "source_screen_name_Discover Genre              0\n",
       "source_screen_name_Discover New                0\n",
       "source_screen_name_Explore                     0\n",
       "source_screen_name_Local playlist more         0\n",
       "source_screen_name_My library                  0\n",
       "source_screen_name_My library_Search           0\n",
       "source_screen_name_Online playlist more        0\n",
       "source_screen_name_Others profile more         0\n",
       "source_screen_name_Payment                     0\n",
       "source_screen_name_People global               0\n",
       "source_screen_name_People local                0\n",
       "source_screen_name_Radio                       0\n",
       "source_screen_name_Search                      0\n",
       "source_screen_name_Search Home                 0\n",
       "source_screen_name_Search Trends               0\n",
       "source_screen_name_Self profile more           0\n",
       "source_screen_name_Unknown                     0\n",
       "source_screen_name_unknown                     0\n",
       "source_system_tab_discover                     0\n",
       "                                           ...  \n",
       "expiration_date_year_2018                      0\n",
       "expiration_date_year_2019                      0\n",
       "expiration_date_year_2020                      0\n",
       "expiration_date_month_1                        0\n",
       "expiration_date_month_2                        0\n",
       "expiration_date_month_3                        0\n",
       "expiration_date_month_4                        0\n",
       "expiration_date_month_5                        0\n",
       "expiration_date_month_6                        0\n",
       "expiration_date_month_7                        0\n",
       "expiration_date_month_8                        0\n",
       "expiration_date_month_9                        0\n",
       "expiration_date_month_10                       0\n",
       "expiration_date_month_11                       0\n",
       "expiration_date_month_12                       0\n",
       "language_-1.0                                  0\n",
       "language_0.0                                   0\n",
       "language_3.0                                   0\n",
       "language_10.0                                  0\n",
       "language_17.0                                  0\n",
       "language_24.0                                  0\n",
       "language_31.0                                  0\n",
       "language_38.0                                  0\n",
       "language_45.0                                  0\n",
       "language_52.0                                  0\n",
       "language_59.0                                  0\n",
       "genre_ids_counts                           42110\n",
       "artist_name_counts                            25\n",
       "song_id_counts                                 0\n",
       "msno_counts                                    0\n",
       "Length: 125, dtype: int64"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train.fillna(0,inplace = True)\n",
    "test.fillna(0,inplace =True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>msno</th>\n",
       "      <th>song_id</th>\n",
       "      <th>target</th>\n",
       "      <th>bd</th>\n",
       "      <th>use_days</th>\n",
       "      <th>song_length_s_log</th>\n",
       "      <th>source_screen_name_Album more</th>\n",
       "      <th>source_screen_name_Artist more</th>\n",
       "      <th>source_screen_name_Concert</th>\n",
       "      <th>source_screen_name_Discover Chart</th>\n",
       "      <th>...</th>\n",
       "      <th>language_24.0</th>\n",
       "      <th>language_31.0</th>\n",
       "      <th>language_38.0</th>\n",
       "      <th>language_45.0</th>\n",
       "      <th>language_52.0</th>\n",
       "      <th>language_59.0</th>\n",
       "      <th>genre_ids_counts</th>\n",
       "      <th>artist_name_counts</th>\n",
       "      <th>song_id_counts</th>\n",
       "      <th>msno_counts</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.408429</td>\n",
       "      <td>0.573898</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.031459</td>\n",
       "      <td>0.003083</td>\n",
       "      <td>0.015545</td>\n",
       "      <td>0.896237</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.446883</td>\n",
       "      <td>0.608311</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.049582</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.092360</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.446883</td>\n",
       "      <td>0.583348</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.049582</td>\n",
       "      <td>0.000985</td>\n",
       "      <td>0.000312</td>\n",
       "      <td>0.092360</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.446883</td>\n",
       "      <td>0.596762</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.000026</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.092360</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.408429</td>\n",
       "      <td>0.563537</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.008982</td>\n",
       "      <td>0.001133</td>\n",
       "      <td>0.029529</td>\n",
       "      <td>0.896237</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 125 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           msno  \\\n",
       "0  FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "1  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "2  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "3  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "4  FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "\n",
       "                                        song_id  target        bd  use_days  \\\n",
       "0  BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=     1.0  0.285714  0.408429   \n",
       "1  bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=     1.0  0.250000  0.446883   \n",
       "2  JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=     1.0  0.250000  0.446883   \n",
       "3  2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=     1.0  0.250000  0.446883   \n",
       "4  3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=     1.0  0.285714  0.408429   \n",
       "\n",
       "   song_length_s_log  source_screen_name_Album more  \\\n",
       "0           0.573898                              0   \n",
       "1           0.608311                              0   \n",
       "2           0.583348                              0   \n",
       "3           0.596762                              0   \n",
       "4           0.563537                              0   \n",
       "\n",
       "   source_screen_name_Artist more  source_screen_name_Concert  \\\n",
       "0                               0                           0   \n",
       "1                               0                           0   \n",
       "2                               0                           0   \n",
       "3                               0                           0   \n",
       "4                               0                           0   \n",
       "\n",
       "   source_screen_name_Discover Chart  ...  language_24.0  language_31.0  \\\n",
       "0                                  0  ...              0              0   \n",
       "1                                  0  ...              0              0   \n",
       "2                                  0  ...              0              0   \n",
       "3                                  0  ...              0              0   \n",
       "4                                  0  ...              0              0   \n",
       "\n",
       "   language_38.0  language_45.0  language_52.0  language_59.0  \\\n",
       "0              0              0              1              0   \n",
       "1              0              0              1              0   \n",
       "2              0              0              1              0   \n",
       "3              0              0              0              0   \n",
       "4              0              0              1              0   \n",
       "\n",
       "   genre_ids_counts  artist_name_counts  song_id_counts  msno_counts  \n",
       "0          0.031459            0.003083        0.015545     0.896237  \n",
       "1          0.049582            1.000000        0.000000     0.092360  \n",
       "2          0.049582            0.000985        0.000312     0.092360  \n",
       "3          0.000026            0.000000        0.000000     0.092360  \n",
       "4          0.008982            0.001133        0.029529     0.896237  \n",
       "\n",
       "[5 rows x 125 columns]"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# del data,FE_train, FE_test, train_num,test_num,train_mms, test_mms, members_data, members, songs_data, songs, data_cat, data_num, data_mms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>msno</th>\n",
       "      <th>song_id</th>\n",
       "      <th>bd</th>\n",
       "      <th>use_days</th>\n",
       "      <th>song_length_s_log</th>\n",
       "      <th>source_screen_name_Album more</th>\n",
       "      <th>source_screen_name_Artist more</th>\n",
       "      <th>source_screen_name_Concert</th>\n",
       "      <th>source_screen_name_Discover Chart</th>\n",
       "      <th>...</th>\n",
       "      <th>language_24.0</th>\n",
       "      <th>language_31.0</th>\n",
       "      <th>language_38.0</th>\n",
       "      <th>language_45.0</th>\n",
       "      <th>language_52.0</th>\n",
       "      <th>language_59.0</th>\n",
       "      <th>genre_ids_counts</th>\n",
       "      <th>artist_name_counts</th>\n",
       "      <th>song_id_counts</th>\n",
       "      <th>msno_counts</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=</td>\n",
       "      <td>WmHKgKMlp1lQMecNdNvDMkvIycZYHnFwDT72I5sIssc=</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.112061</td>\n",
       "      <td>0.582871</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.332630</td>\n",
       "      <td>0.038052</td>\n",
       "      <td>0.055500</td>\n",
       "      <td>0.020018</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=</td>\n",
       "      <td>y/rsZ9DC7FwK5F2PK2D5mj+aOBUJAjuu3dZ14NgE0vM=</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.112061</td>\n",
       "      <td>0.621112</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.316476</td>\n",
       "      <td>0.472468</td>\n",
       "      <td>0.020018</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>/uQAlrAkaczV+nWCd2sPF2ekvXPRipV7q0l+gbLuxjw=</td>\n",
       "      <td>8eZLFOdGVdXBSqoAv5nsLigeH2BvKXzTQYtUM53I0k4=</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.001359</td>\n",
       "      <td>0.619423</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.042051</td>\n",
       "      <td>0.002450</td>\n",
       "      <td>0.000375</td>\n",
       "      <td>0.016724</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=</td>\n",
       "      <td>ztCf8thYsS4YN3GcIL/bvoxLm/T5mYBVKOO4C9NiVfQ=</td>\n",
       "      <td>0.321429</td>\n",
       "      <td>0.692756</td>\n",
       "      <td>0.608688</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.001986</td>\n",
       "      <td>0.002747</td>\n",
       "      <td>0.083238</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=</td>\n",
       "      <td>MKVMpslKcQhMaFEgcEQhEfi5+RZhMYlU3eRDpySrH8Y=</td>\n",
       "      <td>0.321429</td>\n",
       "      <td>0.692756</td>\n",
       "      <td>0.569114</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.001863</td>\n",
       "      <td>0.000558</td>\n",
       "      <td>0.000437</td>\n",
       "      <td>0.083238</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 125 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   id                                          msno  \\\n",
       "0   0  V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=   \n",
       "1   1  V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=   \n",
       "2   2  /uQAlrAkaczV+nWCd2sPF2ekvXPRipV7q0l+gbLuxjw=   \n",
       "3   3  1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=   \n",
       "4   4  1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=   \n",
       "\n",
       "                                        song_id        bd  use_days  \\\n",
       "0  WmHKgKMlp1lQMecNdNvDMkvIycZYHnFwDT72I5sIssc=  0.285714  0.112061   \n",
       "1  y/rsZ9DC7FwK5F2PK2D5mj+aOBUJAjuu3dZ14NgE0vM=  0.285714  0.112061   \n",
       "2  8eZLFOdGVdXBSqoAv5nsLigeH2BvKXzTQYtUM53I0k4=  0.285714  0.001359   \n",
       "3  ztCf8thYsS4YN3GcIL/bvoxLm/T5mYBVKOO4C9NiVfQ=  0.321429  0.692756   \n",
       "4  MKVMpslKcQhMaFEgcEQhEfi5+RZhMYlU3eRDpySrH8Y=  0.321429  0.692756   \n",
       "\n",
       "   song_length_s_log  source_screen_name_Album more  \\\n",
       "0           0.582871                              0   \n",
       "1           0.621112                              0   \n",
       "2           0.619423                              0   \n",
       "3           0.608688                              0   \n",
       "4           0.569114                              0   \n",
       "\n",
       "   source_screen_name_Artist more  source_screen_name_Concert  \\\n",
       "0                               0                           0   \n",
       "1                               0                           0   \n",
       "2                               0                           0   \n",
       "3                               0                           0   \n",
       "4                               0                           0   \n",
       "\n",
       "   source_screen_name_Discover Chart  ...  language_24.0  language_31.0  \\\n",
       "0                                  0  ...              0              0   \n",
       "1                                  0  ...              0              0   \n",
       "2                                  0  ...              0              0   \n",
       "3                                  0  ...              0              0   \n",
       "4                                  0  ...              0              0   \n",
       "\n",
       "   language_38.0  language_45.0  language_52.0  language_59.0  \\\n",
       "0              0              0              0              0   \n",
       "1              0              0              0              0   \n",
       "2              0              0              0              0   \n",
       "3              0              0              1              0   \n",
       "4              0              0              0              0   \n",
       "\n",
       "   genre_ids_counts  artist_name_counts  song_id_counts  msno_counts  \n",
       "0          0.332630            0.038052        0.055500     0.020018  \n",
       "1          1.000000            0.316476        0.472468     0.020018  \n",
       "2          0.042051            0.002450        0.000375     0.016724  \n",
       "3          1.000000            0.001986        0.002747     0.083238  \n",
       "4          0.001863            0.000558        0.000437     0.083238  \n",
       "\n",
       "[5 rows x 125 columns]"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### logistic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.model_selection import GridSearchCV\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X = train.drop(['target','msno','song_id'],axis =1)\n",
    "y = train['target']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,\n",
       "                   intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
       "                   multi_class='warn', n_jobs=None, penalty='l1',\n",
       "                   random_state=None, solver='saga', tol=0.0001, verbose=0,\n",
       "                   warm_start=False)"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lr = LogisticRegression(penalty='l1', C =0.1,solver='saga')\n",
    "lr.fit(X,y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>bd</th>\n",
       "      <th>use_days</th>\n",
       "      <th>song_length_s_log</th>\n",
       "      <th>source_screen_name_Album more</th>\n",
       "      <th>source_screen_name_Artist more</th>\n",
       "      <th>source_screen_name_Concert</th>\n",
       "      <th>source_screen_name_Discover Chart</th>\n",
       "      <th>source_screen_name_Discover Feature</th>\n",
       "      <th>source_screen_name_Discover Genre</th>\n",
       "      <th>source_screen_name_Discover New</th>\n",
       "      <th>...</th>\n",
       "      <th>language_24.0</th>\n",
       "      <th>language_31.0</th>\n",
       "      <th>language_38.0</th>\n",
       "      <th>language_45.0</th>\n",
       "      <th>language_52.0</th>\n",
       "      <th>language_59.0</th>\n",
       "      <th>genre_ids_counts</th>\n",
       "      <th>artist_name_counts</th>\n",
       "      <th>song_id_counts</th>\n",
       "      <th>msno_counts</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.112061</td>\n",
       "      <td>0.582871</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.332630</td>\n",
       "      <td>0.038052</td>\n",
       "      <td>0.055500</td>\n",
       "      <td>0.020018</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.112061</td>\n",
       "      <td>0.621112</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.316476</td>\n",
       "      <td>0.472468</td>\n",
       "      <td>0.020018</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.001359</td>\n",
       "      <td>0.619423</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.042051</td>\n",
       "      <td>0.002450</td>\n",
       "      <td>0.000375</td>\n",
       "      <td>0.016724</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.321429</td>\n",
       "      <td>0.692756</td>\n",
       "      <td>0.608688</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.001986</td>\n",
       "      <td>0.002747</td>\n",
       "      <td>0.083238</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.321429</td>\n",
       "      <td>0.692756</td>\n",
       "      <td>0.569114</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.001863</td>\n",
       "      <td>0.000558</td>\n",
       "      <td>0.000437</td>\n",
       "      <td>0.083238</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 122 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         bd  use_days  song_length_s_log  source_screen_name_Album more  \\\n",
       "0  0.285714  0.112061           0.582871                              0   \n",
       "1  0.285714  0.112061           0.621112                              0   \n",
       "2  0.285714  0.001359           0.619423                              0   \n",
       "3  0.321429  0.692756           0.608688                              0   \n",
       "4  0.321429  0.692756           0.569114                              0   \n",
       "\n",
       "   source_screen_name_Artist more  source_screen_name_Concert  \\\n",
       "0                               0                           0   \n",
       "1                               0                           0   \n",
       "2                               0                           0   \n",
       "3                               0                           0   \n",
       "4                               0                           0   \n",
       "\n",
       "   source_screen_name_Discover Chart  source_screen_name_Discover Feature  \\\n",
       "0                                  0                                    0   \n",
       "1                                  0                                    0   \n",
       "2                                  0                                    0   \n",
       "3                                  0                                    0   \n",
       "4                                  0                                    0   \n",
       "\n",
       "   source_screen_name_Discover Genre  source_screen_name_Discover New  ...  \\\n",
       "0                                  0                                0  ...   \n",
       "1                                  0                                0  ...   \n",
       "2                                  0                                0  ...   \n",
       "3                                  0                                0  ...   \n",
       "4                                  0                                0  ...   \n",
       "\n",
       "   language_24.0  language_31.0  language_38.0  language_45.0  language_52.0  \\\n",
       "0              0              0              0              0              0   \n",
       "1              0              0              0              0              0   \n",
       "2              0              0              0              0              0   \n",
       "3              0              0              0              0              1   \n",
       "4              0              0              0              0              0   \n",
       "\n",
       "   language_59.0  genre_ids_counts  artist_name_counts  song_id_counts  \\\n",
       "0              0          0.332630            0.038052        0.055500   \n",
       "1              0          1.000000            0.316476        0.472468   \n",
       "2              0          0.042051            0.002450        0.000375   \n",
       "3              0          1.000000            0.001986        0.002747   \n",
       "4              0          0.001863            0.000558        0.000437   \n",
       "\n",
       "   msno_counts  \n",
       "0     0.020018  \n",
       "1     0.020018  \n",
       "2     0.016724  \n",
       "3     0.083238  \n",
       "4     0.083238  \n",
       "\n",
       "[5 rows x 122 columns]"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_test = test.drop(['id','msno','song_id'],axis=1)\n",
    "X_test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_pred_lr = lr.predict_proba(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.39240957, 0.60759043],\n",
       "       [0.20325307, 0.79674693],\n",
       "       [0.82410641, 0.17589359],\n",
       "       ...,\n",
       "       [0.58841612, 0.41158388],\n",
       "       [0.59440554, 0.40559446],\n",
       "       [0.57966341, 0.42033659]])"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_pred_lr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_pred_lr = [x[1] for x in y_pred_lr]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0.607590</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>0.796747</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>0.175894</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>0.149185</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>0.155150</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id    target\n",
       "0   0  0.607590\n",
       "1   1  0.796747\n",
       "2   2  0.175894\n",
       "3   3  0.149185\n",
       "4   4  0.155150"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "result_lr = pd.DataFrame({'id':test.id ,'target':y_pred_lr})\n",
    "result_lr.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 2556790 entries, 0 to 2556789\n",
      "Data columns (total 2 columns):\n",
      "id        int64\n",
      "target    float64\n",
      "dtypes: float64(1), int64(1)\n",
      "memory usage: 39.0 MB\n"
     ]
    }
   ],
   "source": [
    "result_lr.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "result_lr.to_csv('result_lr.csv',index = False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import _pickle\n",
    "_pickle.dump(lr,open('lr.pkl','wb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy is:  0.647682020999759\n",
      "Classification report for classifier LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,\n",
      "                   intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
      "                   multi_class='warn', n_jobs=None, penalty='l1',\n",
      "                   random_state=None, solver='saga', tol=0.0001, verbose=0,\n",
      "                   warm_start=False):\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "         0.0       0.66      0.61      0.63   3662762\n",
      "         1.0       0.64      0.69      0.66   3714656\n",
      "\n",
      "    accuracy                           0.65   7377418\n",
      "   macro avg       0.65      0.65      0.65   7377418\n",
      "weighted avg       0.65      0.65      0.65   7377418\n",
      "\n",
      "\n",
      "Confusion matrix:\n",
      "[[2233277 1429485]\n",
      " [1169712 2544944]]\n"
     ]
    }
   ],
   "source": [
    "import _pickle\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.metrics import confusion_matrix\n",
    "lr = _pickle.load(open('./pkl/lr.pkl','rb'))\n",
    "y_pred_lr = lr.predict(X)\n",
    "\n",
    "print(\"accuracy is: \",accuracy_score(y, y_pred_lr))\n",
    "\n",
    "print(\"Classification report for classifier %s:\\n%s\\n\"\n",
    "      % (lr, classification_report(y, y_pred_lr)))\n",
    "print(\"Confusion matrix:\\n%s\" % confusion_matrix(y, y_pred_lr))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_pred_lr = pd.DataFrame({'y_pred_lr':y_pred_lr})\n",
    "y_pred_lr.to_csv('y_pred_lr.csv',index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>msno</th>\n",
       "      <th>song_id</th>\n",
       "      <th>target_lr</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=</td>\n",
       "      <td>0.582111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=</td>\n",
       "      <td>0.605456</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=</td>\n",
       "      <td>0.609229</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=</td>\n",
       "      <td>0.611934</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=</td>\n",
       "      <td>0.589988</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           msno  \\\n",
       "0  FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "1  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "2  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "3  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "4  FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "\n",
       "                                        song_id  target_lr  \n",
       "0  BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=   0.582111  \n",
       "1  bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=   0.605456  \n",
       "2  JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=   0.609229  \n",
       "3  2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=   0.611934  \n",
       "4  3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=   0.589988  "
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_pred = lr.predict_proba(X)\n",
    "train_pred = [x[1] for x in train_pred]\n",
    "train_pred_lr = pd.DataFrame({'msno':train.msno,'song_id':train.song_id ,'target_lr':train_pred})\n",
    "train_pred_lr.to_csv('train_prob_lr.csv',index = False)\n",
    "train_pred_lr.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.41788942, 0.58211058],\n",
       "       [0.39454372, 0.60545628],\n",
       "       [0.39077055, 0.60922945],\n",
       "       ...,\n",
       "       [0.17361975, 0.82638025],\n",
       "       [0.69266362, 0.30733638],\n",
       "       [0.69401673, 0.30598327]])"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_pred "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
